Advanced Data Visualization Theory DA-1 Prashanth.S 19MID0020

Data-sets used — data-set 1: https://raw.githubusercontent.com/ScienceParkStudyGroup/r-lesson-based-on-ohi-data-training/gh-pages/data/ca.csv

data-set 2: https://www.kaggle.com/code/adhok93/zomato-eda-in-r/data?select=zomato.csv

Importing the libraries

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.8
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
require(devtools)
## Loading required package: devtools
## Loading required package: usethis

US National Parks data-set

Importing the data-set

# Visitor records for the National Parks in California, read straight from
# the remote CSV into a tibble.
ca <- readr::read_csv(
  file = "https://raw.githubusercontent.com/ScienceParkStudyGroup/r-lesson-based-on-ohi-data-training/gh-pages/data/ca.csv"
)
## Rows: 789 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): region, state, code, park_name, type
## dbl (2): visitors, year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows of the imported table
ca %>% head()

Analysing the data-type of the data

# Column names of the parks table
names(ca)
## [1] "region"    "state"     "code"      "park_name" "type"      "visitors" 
## [7] "year"
# Compact structure listing: column types plus the leading values of each column
utils::str(ca)
## spec_tbl_df [789 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ region   : chr [1:789] "PW" "PW" "PW" "PW" ...
##  $ state    : chr [1:789] "CA" "CA" "CA" "CA" ...
##  $ code     : chr [1:789] "CHIS" "CHIS" "CHIS" "CHIS" ...
##  $ park_name: chr [1:789] "Channel Islands National Park" "Channel Islands National Park" "Channel Islands National Park" "Channel Islands National Park" ...
##  $ type     : chr [1:789] "National Park" "National Park" "National Park" "National Park" ...
##  $ visitors : num [1:789] 1200 1500 1600 300 15700 ...
##  $ year     : num [1:789] 1963 1964 1965 1966 1967 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   region = col_character(),
##   ..   state = col_character(),
##   ..   code = col_character(),
##   ..   park_name = col_character(),
##   ..   type = col_character(),
##   ..   visitors = col_double(),
##   ..   year = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Plot-1 –> ggplot2

Bar Plot

# Human-readable park names keyed by the park codes shown on the axis.
# NOTE: this vector shares its name with ggplot2::labs(); the labs(...) call
# below still reaches the function because R skips non-function bindings
# when it resolves a call.
labs <- c(
  "CHIS" = "Channel Islands National Park",
  "DEVA" = "Death Valley National Park",
  "JOTR" = "Joshua Tree National Park",
  "KICA" = "Kings Canyon National Park",
  "LAVO" = "Lassen Volcanic National Park",
  "YOSE" = "Yosemite National Park",
  "SEQU" = "Sequoia National Park",
  "REDW" = "Redwood National Park",
  "PINN" = "Pinnacles National Park"
)

# Share of data-set rows contributed by each park code.
# after_stat() replaces the older `..count..` notation for computed variables.
# Every scale is chained with `+`: previously the y scale sat on its own line
# after a missing `+`, so it was printed to the console instead of being
# applied and the axis never showed percentages.
bar_plot <- ggplot(
  data = ca,
  aes(x = code, y = after_stat(count / sum(count)), fill = factor(code))
) +
  geom_bar(color = "black") +
  labs(
    x = "National Parks",
    y = "Percentage of National Parks in the data-set",
    title = "Occurence of the National Parks in the data-set"
  ) +
  scale_x_discrete(labels = labs) +
  scale_y_continuous(labels = scales::percent)

# Flip the axes so the long park names are drawn along the vertical axis
bar_plot + coord_flip()

Yosemite National Park and Sequoia National Park occur more often in the data-set than the other parks.

Kernel Density Plot

# Kernel density of the visitor counts on a log10 scale.
# Rows with zero (or missing) visitors are dropped first: log10(0) is -Inf,
# which the density layer would otherwise discard with a warning.
# The axis label states the log10 transform so the tick values are not read
# as raw visitor counts.
density_plot <- ca %>%
  dplyr::filter(visitors > 0) %>%
  ggplot(aes(x = log10(visitors))) +
  geom_density(fill = "indianred3") +
  labs(
    x = "Number of visitors (log10)",
    y = "density",
    title = "Kernel density of Visitors"
  )
density_plot

Plot-2 –> plotly

Scatter plot

# Font specifications.
# NOTE(review): none of these four lists is referenced by the visible code;
# they are kept in case something later in the session relies on them.
t <- list(family = "Helvetica", size = 14, color = "blue")
t1 <- list(family = "Times New Roman", color = "red")
t2 <- list(family = "Courier New", size = 14, color = "green")
t3 <- list(family = "Arial")

# Visitors per year, one marker colour per park.
# An explicit 12-colour ColorBrewer palette is supplied because the default
# qualitative palette (Set2) tops out at 8 colours, which is fewer than the
# number of parks and made RColorBrewer warn "n too large" on every print.
scatter_plot <- plot_ly(
  data = ca,
  x = ~year,
  y = ~visitors,
  color = ~park_name,
  colors = "Paired",
  type = "scatter",
  mode = "markers"
) %>%
  layout(
    title = list(text = "<b>Year and Visitors"),
    legend = list(title = list(text = "<b>National Parks")),
    xaxis = list(title = list(text = "<b>Year")),
    yaxis = list(title = list(text = "<b>Visitors")),
    plot_bgcolor = "#e5ecf6"
  )
scatter_plot

Yosemite National Park attracts more visitors than any other national park.

Barchart

# Grouped bar chart of every attribute, per park.
# The table is reshaped to long form (one row per park/attribute pair) with
# tidyr::pivot_longer(). The previous data.table::melt() call only worked
# through a deprecated redirect to reshape2 that is announced to become an
# error.
# - values_transform: the measured columns mix character and numeric types,
#   so all values are converted to character to share a single column.
# - fct_inorder(): keeps the attributes in their original column order so the
#   traces and legend are not re-sorted alphabetically.
r_group_barchart <- ca %>%
  tidyr::pivot_longer(
    cols = -park_name,
    names_to = "variable",
    values_to = "value",
    values_transform = list(value = as.character)
  ) %>%
  dplyr::mutate(variable = forcats::fct_inorder(variable)) %>%
  plot_ly(
    x = ~park_name,
    y = ~value,
    type = "bar",
    name = ~variable,
    color = ~variable
  ) %>%
  layout(
    title = list(text = "<b>Total Distribution based on the Data-Set"),
    legend = list(title = list(text = "<b>Attribute")),
    xaxis = list(title = list(text = "<b>Parks")),
    # The axis title is given as title = list(text = ...), matching the other
    # axes; the earlier spec put `text` beside `title`, where it is not a
    # valid axis attribute.
    yaxis = list(title = list(text = "<b>Count")),
    barmode = "group"
  )
r_group_barchart

Kings Canyon National Park ranks at the top in terms of region, state and national park.

Pie-chart

# Number of data-set rows recorded for each park name
# (Var1 = park name, Freq = row count).
df_order <- data.frame(table(ca$park_name))
df_order

# Pie chart of those row counts. The title and legend name the parks; the
# earlier "Order" wording came from a different data-set and did not describe
# what is plotted.
pie_chart <- plot_ly(
  type = "pie",
  labels = df_order$Var1,
  values = df_order$Freq,
  textinfo = "label+percent",
  insidetextorientation = "radial"
) %>%
  layout(
    title = list(text = "<b>National Park Distribution"),
    legend = list(title = list(text = "<b>National Park"))
  )

pie_chart

Histogram Plots

# Histogram of the natural log of the visitor counts, with traces named by
# park code. The title, legend and x-axis labels describe the parks data; the
# earlier text ("sleep time of Animals", "Vore") belonged to a different
# data-set, and the axis was labelled as raw visitors although log() is
# applied.
histogram_plot <- plot_ly(
  data = ca,
  x = ~(log(visitors)),
  name = ~code,
  type = "histogram"
) %>%
  layout(
    title = list(text = "<b>Distribution of log(Visitors) by Park Code"),
    legend = list(title = list(text = "<b>Park Code")),
    xaxis = list(title = list(text = "<b>log(Visitors)")),
    yaxis = list(title = list(text = "<b>Count"))
  )
histogram_plot

The SEQU and LAVO national parks have a higher number of visitors.

Donut Chart / Open Pie-Chart

# Number of data-set rows recorded for each park code
# (Var1 = park code, Freq = row count).
# NOTE(review): the name `df_vore` is kept so later references keep working,
# although the table holds park codes.
df_vore <- data.frame(table(ca$code))
df_vore

# Donut chart (a pie trace with a 0.6 hole) of those row counts. The title
# and legend name the park codes instead of the unrelated "Order" wording.
donut_chart <- plot_ly(
  labels = df_vore$Var1,
  values = df_vore$Freq,
  textinfo = "label+percent"
) %>%
  add_pie(hole = 0.6) %>%
  layout(
    title = list(text = "<b>Park Code Distribution"),
    legend = list(title = list(text = "<b>Park Code"))
  )

donut_chart

With respect to the pie chart, the SEQU, YOSE and KICA national parks have a higher number of visitors.

Zomato data-set

Importing the library

library(dplyr)

Importing the data-set

# Read the Zomato restaurant table from a CSV in the working directory
df <- utils::read.csv(file = 'zomato.csv')
# Preview the first rows
df %>% head()

Analysing the data-type of the data

# Column names of the Zomato table
colnames(df)
##  [1] "Restaurant.ID"        "Restaurant.Name"      "Country.Code"        
##  [4] "City"                 "Address"              "Locality"            
##  [7] "Locality.Verbose"     "Longitude"            "Latitude"            
## [10] "Cuisines"             "Average.Cost.for.two" "Currency"            
## [13] "Has.Table.booking"    "Has.Online.delivery"  "Is.delivering.now"   
## [16] "Switch.to.order.menu" "Price.range"          "Aggregate.rating"    
## [19] "Rating.color"         "Rating.text"          "Votes"
# Compact structure listing: column types plus the leading values of each column
utils::str(df)
## 'data.frame':    9551 obs. of  21 variables:
##  $ Restaurant.ID       : int  6317637 6304287 6300002 6318506 6314302 18189371 6300781 6301290 6300010 6314987 ...
##  $ Restaurant.Name     : chr  "Le Petit Souffle" "Izakaya Kikufuji" "Heat - Edsa Shangri-La" "Ooma" ...
##  $ Country.Code        : int  162 162 162 162 162 162 162 162 162 162 ...
##  $ City                : chr  "Makati City" "Makati City" "Mandaluyong City" "Mandaluyong City" ...
##  $ Address             : chr  "Third Floor, Century City Mall, Kalayaan Avenue, Poblacion, Makati City" "Little Tokyo, 2277 Chino Roces Avenue, Legaspi Village, Makati City" "Edsa Shangri-La, 1 Garden Way, Ortigas, Mandaluyong City" "Third Floor, Mega Fashion Hall, SM Megamall, Ortigas, Mandaluyong City" ...
##  $ Locality            : chr  "Century City Mall, Poblacion, Makati City" "Little Tokyo, Legaspi Village, Makati City" "Edsa Shangri-La, Ortigas, Mandaluyong City" "SM Megamall, Ortigas, Mandaluyong City" ...
##  $ Locality.Verbose    : chr  "Century City Mall, Poblacion, Makati City, Makati City" "Little Tokyo, Legaspi Village, Makati City, Makati City" "Edsa Shangri-La, Ortigas, Mandaluyong City, Mandaluyong City" "SM Megamall, Ortigas, Mandaluyong City, Mandaluyong City" ...
##  $ Longitude           : num  121 121 121 121 121 ...
##  $ Latitude            : num  14.6 14.6 14.6 14.6 14.6 ...
##  $ Cuisines            : chr  "French, Japanese, Desserts" "Japanese" "Seafood, Asian, Filipino, Indian" "Japanese, Sushi" ...
##  $ Average.Cost.for.two: int  1100 1200 4000 1500 1500 1000 2000 2000 6000 1100 ...
##  $ Currency            : chr  "Botswana Pula(P)" "Botswana Pula(P)" "Botswana Pula(P)" "Botswana Pula(P)" ...
##  $ Has.Table.booking   : chr  "Yes" "Yes" "Yes" "No" ...
##  $ Has.Online.delivery : chr  "No" "No" "No" "No" ...
##  $ Is.delivering.now   : chr  "No" "No" "No" "No" ...
##  $ Switch.to.order.menu: chr  "No" "No" "No" "No" ...
##  $ Price.range         : int  3 3 4 4 4 3 4 4 4 3 ...
##  $ Aggregate.rating    : num  4.8 4.5 4.4 4.9 4.8 4.4 4 4.2 4.9 4.8 ...
##  $ Rating.color        : chr  "Dark Green" "Dark Green" "Green" "Dark Green" ...
##  $ Rating.text         : chr  "Excellent" "Excellent" "Very Good" "Excellent" ...
##  $ Votes               : int  314 591 270 365 229 336 520 677 621 532 ...
# Share of restaurants falling under each rating colour.
# The axis labels and title describe the Zomato data; the earlier text still
# referred to National Parks. after_stat() replaces the older `..count..`
# notation for computed variables.
# Note: this reassigns `bar_plot`, replacing the parks chart stored earlier.
bar_plot <- ggplot(
  data = df,
  aes(
    x = Rating.color,
    y = after_stat(count / sum(count)),
    fill = factor(Rating.color)
  )
) +
  geom_bar(color = "black") +
  labs(
    x = "Rating color",
    y = "Percentage of restaurants in the data-set",
    title = "Occurrence of the rating colors in the data-set"
  ) +
  scale_y_continuous(labels = scales::percent)

# Flip the axes so the rating colours are listed along the vertical axis
bar_plot + coord_flip()

# Number of distinct restaurant IDs among the rows whose country code is 1.
# which() keeps only the positions where the comparison is TRUE.
length(unique(df$Restaurant.ID[which(df$Country.Code == 1)]))
## [1] 8652

There are 8652 restaurants operating in India

# Horizontal bar chart: distinct restaurants per city for country code 1,
# with the cities ordered by their restaurant count.
df %>%
  filter(Country.Code == 1) %>%
  distinct(Restaurant.ID, City) %>%
  count(City) %>%
  ggplot(aes(x = reorder(City, n), y = n)) +
  geom_col(fill = '#cb202d') +
  coord_flip() +
  labs(
    x = 'City',
    y = 'Number of Restaurants',
    title = "Number of Restaurants by City"
  ) +
  theme(
    panel.background = element_blank(),
    strip.background = element_blank(),
    axis.title = element_text(color = '#2d2d2d'),
    strip.text.x = element_text(color = '#2d2d2d', face = 'bold', size = 10),
    plot.title = element_text(hjust = 0.5, face = 'bold', size = 15)
  )

Delhi has more restaurants operating with Zomato than the other cities.

# Rows for country code 1 restricted to the city of Agra
df_india <- df %>% filter(Country.Code == 1 & City == "Agra")
head(df_india)

# Restaurant locations drawn as points at their longitude/latitude.
# - geom_point() replaces geom_polygon(), which joined the unrelated
#   restaurant coordinates into one filled shape.
# - The stray `color` argument to ggplot() is dropped; it was not an
#   aesthetic and had no effect there.
# - The discrete viridis scale is now tied to a mapped aesthetic; before,
#   the fill scale had nothing to act on.
# NOTE(review): colouring by Rating.text is an assumption about the intended
# grouping — confirm, or swap in another categorical column.
ggplot(
  data = df_india,
  aes(x = Longitude, y = Latitude, color = Rating.text)
) +
  geom_point() +
  scale_color_viridis_d(option = "A")

Plot-4 –> gganimate

library(gganimate)
library(gifski)

Animate –> Scatter Plot

# Year-versus-visitors scatter that steps through one park name at a time
scatter_plot_animate <- ca %>%
  ggplot(mapping = aes(x = year, y = visitors)) +
  geom_point() +
  transition_states(states = park_name)

# Render the animation as a GIF through gifski
animate(plot = scatter_plot_animate, renderer = gifski_renderer())

Animate –> Line Plot

# Keep only the rows whose park code is CHIS, DEVA or JOTR
d <- dplyr::filter(ca, code %in% c('CHIS', 'DEVA', 'JOTR'))
d

# One line (with point markers) per park code, revealed progressively
# along the year axis
line_plot <- d %>%
  ggplot(mapping = aes(x = year, y = visitors, group = code, color = code)) +
  geom_line() +
  geom_point() +
  transition_reveal(along = year)

# Render a 300 x 300 GIF through gifski
animate(
  plot = line_plot,
  renderer = gifski_renderer(),
  width = 300,
  height = 300
)

Visitors started to visit Channel Islands National Park in the 1960s (the latest among the three parks), and it has had very low visitor numbers to date. Joshua Tree National Park shows continuous growth in visitors without a big drop. Death Valley National Park shows continuous growth in visitors in the 2010s and a gradual pick-up.